{
    "cells": [
        {
            "attachments": {},
            "cell_type": "markdown",
            "id": "b1c1ebaa-50de-4851-a720-acbb977551ea",
            "metadata": {},
            "source": [
                "# Recency Filtering\n",
                "\n",
                "Showcase capabilities of recency-weighted node postprocessor"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 1,
            "id": "92d06b38-2103-4a40-93c3-60e0708a1124",
            "metadata": {},
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "/Users/jerryliu/Programming/llama_index/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
                        "  from .autonotebook import tqdm as notebook_tqdm\n"
                    ]
                }
            ],
            "source": [
                "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
                "from llama_index.indices.postprocessor import (\n",
                "    FixedRecencyPostprocessor,\n",
                "    EmbeddingRecencyPostprocessor,\n",
                ")\n",
                "from llama_index.node_parser import SimpleNodeParser\n",
                "from llama_index.storage.docstore import SimpleDocumentStore\n",
                "from llama_index.response.notebook_utils import display_response"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "67020156-2975-4bbb-8e98-afc55abb3d72",
            "metadata": {},
            "source": [
                "### Parse Documents into Nodes, add to Docstore\n",
                "\n",
                "In this example, there are 3 different versions of PG's essay. They are largely identical **except** \n",
                "for one specific section, which details the amount of funding they raised for Viaweb. \n",
                "\n",
                "V1: 50k, V2: 30k, V3: 10K\n",
                "\n",
                "V1: 2020-01-01, V2: 2020-02-03, V3: 2022-04-12\n",
                "\n",
                "The idea is to encourage index to fetch the most recent info (which is V3)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 2,
            "id": "caddd84e-9827-40a4-9520-dba6405fd1fd",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "# load documents\n",
                "from llama_index.storage.storage_context import StorageContext\n",
                "\n",
                "\n",
                "def get_file_metadata(file_name: str):\n",
                "    \"\"\"Get file metadata.\"\"\"\n",
                "    if \"v1\" in file_name:\n",
                "        return {\"date\": \"2020-01-01\"}\n",
                "    elif \"v2\" in file_name:\n",
                "        return {\"date\": \"2020-02-03\"}\n",
                "    elif \"v3\" in file_name:\n",
                "        return {\"date\": \"2022-04-12\"}\n",
                "    else:\n",
                "        raise ValueError(\"invalid file\")\n",
                "\n",
                "\n",
                "documents = SimpleDirectoryReader(\n",
                "    input_files=[\n",
                "        \"test_versioned_data/paul_graham_essay_v1.txt\",\n",
                "        \"test_versioned_data/paul_graham_essay_v2.txt\",\n",
                "        \"test_versioned_data/paul_graham_essay_v3.txt\",\n",
                "    ],\n",
                "    file_metadata=get_file_metadata,\n",
                ").load_data()\n",
                "\n",
                "# define service context (wrapper container around current classes)\n",
                "service_context = ServiceContext.from_defaults(chunk_size=512)\n",
                "\n",
                "# use node parser in service context to parse into nodes\n",
                "nodes = service_context.node_parser.get_nodes_from_documents(documents)\n",
                "\n",
                "# add to docstore\n",
                "docstore = SimpleDocumentStore()\n",
                "docstore.add_documents(nodes)\n",
                "\n",
                "storage_context = StorageContext.from_defaults(docstore=docstore)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "191ced40-80f4-40e7-bf31-0c9a5a664cf2",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "print(documents[2].get_text())"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "e5a25b95-de5e-4e56-a846-51e9c6eba181",
            "metadata": {},
            "source": [
                "### Build Index"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 3,
            "id": "5f7f68d6-2389-4f6c-bc4e-8612a1a53fb8",
            "metadata": {
                "tags": []
            },
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
                        "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 84471 tokens\n"
                    ]
                }
            ],
            "source": [
                "# build index\n",
                "index = VectorStoreIndex(nodes, storage_context=storage_context)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "86c5e8aa-18d8-4229-b7b2-a1c97c11a09a",
            "metadata": {},
            "source": [
                "### Define Recency Postprocessors"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 6,
            "id": "ba5e10c9-5a7e-4ea8-a74d-0e0f74b5cd1b",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "node_postprocessor = FixedRecencyPostprocessor(service_context=service_context)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 7,
            "id": "94f44f2b-d816-43a0-87dc-ea8eefc7d534",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "node_postprocessor_emb = EmbeddingRecencyPostprocessor(service_context=service_context)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "efcfffe4-a8aa-486d-b46d-f73f985dffca",
            "metadata": {},
            "source": [
                "### Query Index"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 8,
            "id": "78d6c3db-61e6-4d9a-a84d-d7be846b4112",
            "metadata": {
                "tags": []
            },
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1813 tokens\n",
                        "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens\n"
                    ]
                }
            ],
            "source": [
                "# naive query\n",
                "\n",
                "query_engine = index.as_query_engine(\n",
                "    similarity_top_k=3,\n",
                ")\n",
                "response = query_engine.query(\n",
                "    \"How much did the author raise in seed funding from Idelle's husband (Julian) for Viaweb?\",\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "1d672c52-c0ac-4e5f-9175-855e66eb97ba",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "# query using fixed recency node postprocessor\n",
                "\n",
                "query_engine = index.as_query_engine(\n",
                "    similarity_top_k=3, node_postprocessors=[node_postprocessor]\n",
                ")\n",
                "response = query_engine.query(\n",
                "    \"How much did the author raise in seed funding from Idelle's husband (Julian) for Viaweb?\",\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 12,
            "id": "bc1328c1-23b2-406c-b80b-6d97bffc33ae",
            "metadata": {
                "tags": []
            },
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens\n",
                        "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens\n"
                    ]
                }
            ],
            "source": [
                "# query using embedding-based node postprocessor\n",
                "\n",
                "query_engine = index.as_query_engine(\n",
                "    similarity_top_k=3, node_postprocessors=[node_postprocessor]\n",
                ")\n",
                "response = query_engine.query(\n",
                "    \"How much did the author raise in seed funding from Idelle's husband (Julian) for Viaweb?\",\n",
                ")"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "dd00cc97-4de7-4c61-9c0c-3f9ee3598528",
            "metadata": {},
            "source": [
                "### Query Index (Lower-Level Usage)\n",
                "\n",
                "In this example we first get the full set of nodes from a query call, and then send to node postprocessor, and then\n",
                "finally synthesize response through a summary index."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 18,
            "id": "350b039e-d45d-4b6b-957a-4b14d8816cbd",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "from llama_index import SummaryIndex"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 19,
            "id": "234f909f-6faa-43e6-96f8-0966699c9552",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "query_str = \"How much did the author raise in seed funding from Idelle's husband (Julian) for Viaweb?\""
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 20,
            "id": "20afbf6b-9473-446e-b522-b90fef2e3bf0",
            "metadata": {
                "tags": []
            },
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens\n",
                        "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens\n"
                    ]
                }
            ],
            "source": [
                "query_engine = index.as_query_engine(similarity_top_k=3, response_mode=\"no_text\")\n",
                "init_response = query_engine.query(\n",
                "    query_str,\n",
                ")\n",
                "resp_nodes = [n.node for n in init_response.source_nodes]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 22,
            "id": "cdc03574-a806-4255-953c-6f82fc3f202f",
            "metadata": {
                "tags": []
            },
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
                        "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n",
                        "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens\n",
                        "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n"
                    ]
                }
            ],
            "source": [
                "summary_index = SummaryIndex(resp_nodes)\n",
                "query_engine = summary_index.as_query_engine(node_postprocessors=[node_postprocessor])\n",
                "response = query_engine.query(query_str)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "f371e105-ad93-491c-ad27-35b3e34382f3",
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "llama_index",
            "language": "python",
            "name": "llama_index"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.10.10"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 5
}
